# Point the session at the notebook folder so that the relative file name
# passed to read_html() further down resolves.
# NOTE(review): this is an absolute, user-specific path; an RStudio project or
# a project-relative path would make the script portable -- confirm before
# changing, since later statements depend on the working directory.
setwd("~/Dropbox/github/art-data-science/notebook/onion")
library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(plotly)
library(ggmap)
Google Maps API Terms of Service: http://developers.google.com/maps/terms.
Please cite ggmap if you use it: see citation('ggmap') for details.
Attaching package: ‘ggmap’
The following object is masked from ‘package:plotly’:
wind
# Acquire: parse the locally saved HTML page.
pg.out <- read_html("MonthWiseMarketArrivalsNew.htm")

# Locate the arrivals grid by its element id and convert it to a data frame.
pg.table <- html_table(
  html_node(pg.out, "#dnn_ctr974_MonthWiseMarketArrivals_GridView1")
)

# Inspect the column names and types that came out of the scrape.
str(pg.table)
'data.frame': 3784 obs. of 7 variables:
$ Market : chr "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
$ Month Name : chr "January" "January" "January" "February" ...
$ Year : chr "2014" "2015" "2017" "2014" ...
$ Arrival (q) : int 440 1305 200 1115 1115 1300 920 670 1350 940 ...
$ Price Minimum (Rs/q): chr "1025" "1309" "750" "831" ...
$ Price Maximum (Rs/q): chr "1481" "1858" "1000" "1163" ...
$ Modal Price (Rs/q) : chr "1256" "1613" "850" "983" ...
# Refine: work on a copy of the scraped table.
df <- pg.table

# The scraped headers ("Month Name", "Arrival (q)", ...) are not the names the
# rest of the script uses. Without this rename every `df$<name>` lookup below
# returns NULL and the assignments fail, so give the seven columns short,
# syntactic names first (same order as in the scraped table).
colnames(df) <- c(
  "market", "month", "year", "quantity", "priceMin", "priceMax", "priceMod"
)

# NOTE(review): the recorded str() output after this step shows 3783 rows
# against 3784 scraped rows, so the original session presumably also dropped
# one row (possibly a trailing summary row) -- confirm against the HTML source.

# Year and the three price columns arrive as character and arrivals as
# integer; convert all of them to numeric.
df$quantity <- as.numeric(df$quantity)
df$year <- as.numeric(df$year)
df$priceMin <- as.numeric(df$priceMin)
df$priceMax <- as.numeric(df$priceMax)
df$priceMod <- as.numeric(df$priceMod)

# Verify names and types after the cleanup.
str(df)
'data.frame': 3783 obs. of 7 variables:
$ market : chr "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
$ month : chr "January" "January" "January" "February" ...
$ year : num 2014 2015 2017 2014 2015 ...
$ quantity: num 440 1305 200 1115 1115 ...
$ priceMin: num 1025 1309 750 831 1200 ...
$ priceMax: num 1481 1858 1000 1163 1946 ...
$ priceMod: num 1256 1613 850 983 1688 ...
# Split a copy of the market label, e.g. "ABOHAR(PB)", at the opening
# parenthesis into a city part and a state part.
# Not every label has exactly one "(": some have none and some have several.
# With the defaults, separate() warns about both cases; state the handling
# explicitly instead. `extra = "drop"` discards pieces beyond the second and
# `fill = "right"` leaves `state` as NA when there is no "(", which is exactly
# what the defaults do, minus the warnings.
df <- df %>%
  mutate(market1 = market) %>%
  separate(
    market1,
    into = c("city", "state"),
    sep = "\\(",
    extra = "drop",
    fill = "right"
  )
Too many values at 99 locations: 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, ...Too few values at 535 locations: 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, ...
# Tidy up the state column in one pass:
#   1. strip the closing parenthesis left over from the split;
#   2. where the market label had no state part (state is NA), fall back to
#      the full market label.
df <- df %>%
  mutate(
    state = str_replace(state, "\\)", ""),
    state = ifelse(is.na(state), market, state)
  )

# Peek at the first rows to check the new city/state columns.
head(df)
# Build a proper Date column: each record is pinned to the first day of its
# month, assembled as "01-<month name>-<year>" and parsed with a matching
# format string.
df <- df %>%
  mutate(
    date = as.Date(paste0("01-", month, "-", year), format = "%d-%B-%Y")
  )

# Confirm that `date` now has class Date.
str(df)
'data.frame': 3783 obs. of 10 variables:
$ market : chr "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
$ month : chr "January" "January" "January" "February" ...
$ year : num 2014 2015 2017 2014 2015 ...
$ quantity: num 440 1305 200 1115 1115 ...
$ priceMin: num 1025 1309 750 831 1200 ...
$ priceMax: num 1481 1858 1000 1163 1946 ...
$ priceMod: num 1256 1613 850 983 1688 ...
$ city : chr "ABOHAR" "ABOHAR" "ABOHAR" "ABOHAR" ...
$ state : chr "PB" "PB" "PB" "PB" ...
$ date : Date, format: "2014-01-01" "2015-01-01" ...
# Keep only the records for the year 2016.
df2016 <- filter(df, year == 2016)

# Show the first few 2016 records.
head(df2016)
# Split-Apply-Combine
# Total 2016 arrivals per city, keeping only the cities whose yearly total
# exceeds 2,500,000, listed from the largest total downwards.
df2016City <- df %>%
  filter(year == 2016) %>%
  group_by(city) %>%
  summarise(quantity_year = sum(quantity)) %>%
  filter(quantity_year > 2500000) %>%
  arrange(desc(quantity_year))

# Show the top of the resulting table.
head(df2016City)
# Names of the high-volume cities, in the order they appear in df2016City.
cities <- unique(df2016City[["city"]])

# Print the vector for inspection.
cities
[1] "BANGALORE" "MAHUVA" "PIMPALGAON" "SOLAPUR" "LASALGAON" "PUNE"
[7] "DELHI" "NEWASA" "MUMBAI"
# All records (every year) that belong to one of the high-volume cities.
dfCity <- filter(df, city %in% cities)

# Rows x columns of the subset.
dim(dfCity)
[1] 348 10
# Modal price over time, one coloured line per city.
g <- ggplot(dfCity, aes(x = date, y = priceMod, color = city)) +
  geom_line()

# Static rendering first ...
g

# ... then the same plot as an interactive plotly widget.
ggplotly(g)
# Reshape to long form: the three price columns are stacked into a
# priceType / priceValue pair, then the rows are ordered by date.
# The price columns are selected by name rather than by position (5:7) so the
# reshape keeps working if columns are added or reordered upstream.
dfCityTall <- dfCity %>%
  gather("priceType", "priceValue", priceMin, priceMax, priceMod) %>%
  arrange(date)

# One panel per city, one coloured line per price type.
ggplot(dfCityTall, aes(x = date, y = priceValue, color = priceType)) +
  geom_line() +
  facet_wrap(~city)
# Monthly modal price series for Bangalore only, in chronological order.
dfBang <- df %>%
  filter(city == "BANGALORE") %>%
  arrange(date) %>%
  select(date, priceMod)

# Quick look at the series before modelling.
ggplot(dfBang, aes(x = date, y = priceMod)) +
  geom_line()

# Rename the two columns to ds / y, the names passed on to prophet() below.
names(dfBang) <- c("ds", "y")

# Check the structure of the modelling frame.
str(dfBang)
'data.frame': 40 obs. of 2 variables:
$ ds: Date, format: "2014-01-01" "2014-02-01" ...
$ y : num 1094 797 748 712 941 ...
# prophet is not among the packages attached at the top of the script, so
# prophet() -- and make_future_dataframe() / prophet_plot_components() used
# further down -- would not be found. Attach it before fitting.
library(prophet)

# Fit the forecasting model to the Bangalore series (columns ds and y).
m <- prophet(dfBang)
Disabling weekly seasonality. Run prophet with `weekly.seasonality=TRUE` to override this.
Initial log joint probability = -3.13771
Optimization terminated normally:
Convergence detected: absolute parameter change was below tolerance
# Extend the model's date range by 12 further monthly periods.
# The frequency is spelled out as "month", one of the values listed in the
# make_future_dataframe() documentation, instead of the abbreviation 'm'.
future <- make_future_dataframe(m, periods = 12, freq = "month")

# Show the last dates, i.e. the end of the forecast horizon.
tail(future)
# Predict over the historical plus future dates.
forecast <- predict(m, future)

# Last rows of the point forecast together with its lower and upper bounds.
forecast %>%
  select(ds, yhat, yhat_lower, yhat_upper) %>%
  tail()

# Forecast plot, followed by the per-component breakdown.
plot(m, forecast)
prophet_plot_components(m, forecast)
# Distinct city names, in the order they first appear in dfCity.
uniqcity <- unique(dfCity[["city"]])

# Look up coordinates for each city name; the result has one row per
# element of uniqcity, in the same order.
# NOTE(review): recent ggmap releases may require an API key to be registered
# before geocode() can be used -- confirm for the installed version.
geo <- geocode(location = uniqcity)
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=BANGALORE&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=DELHI&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=LASALGAON&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=MAHUVA&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=MUMBAI&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=NEWASA&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PIMPALGAON&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PUNE&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=SOLAPUR&sensor=false
# Attach coordinates to the per-city 2016 totals.
# geo has one row per element of uniqcity, in uniqcity's order (the order in
# which cities first appear in dfCity), whereas df2016City is sorted by
# quantity_year descending. The recorded output shows the two orders differ
# (geocode requests are alphabetical, the totals are not), so binding the
# tables side by side pairs cities with other cities' coordinates. Label each
# geocode row with its city and join on that key instead.
dfGeo <- df2016City %>%
  left_join(
    bind_cols(data.frame(city = uniqcity, stringsAsFactors = FALSE), geo),
    by = "city"
  )

# Inspect the combined table: city, quantity_year, lon, lat.
dfGeo

# Bare scatter of the cities on a map projection, point size scaled by the
# yearly quantity (in thousands).
ggplot(dfGeo, aes(x = lon, y = lat, size = quantity_year / 1000)) +
  geom_point() +
  coord_map()
# Fetch a base map for the location "India" at zoom level 5, using get_map()'s
# default source and map type.
map <- get_map(location = "India", zoom = 5)
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
# Render the base map fetched above on its own, without any data layer.
ggmap(map)
# Second base map for the same location and zoom, this time requesting the
# "watercolor" map type from the "stamen" source.
map1 <- get_map(
  location = "India",
  zoom = 5,
  source = "stamen",
  maptype = "watercolor"
)
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
Map from URL : http://tile.stamen.com/watercolor/5/21/12.jpg
Map from URL : http://tile.stamen.com/watercolor/5/22/12.jpg
Map from URL : http://tile.stamen.com/watercolor/5/23/12.jpg
Map from URL : http://tile.stamen.com/watercolor/5/24/12.jpg
Map from URL : http://tile.stamen.com/watercolor/5/21/13.jpg
Map from URL : http://tile.stamen.com/watercolor/5/22/13.jpg
Map from URL : http://tile.stamen.com/watercolor/5/23/13.jpg
Map from URL : http://tile.stamen.com/watercolor/5/24/13.jpg
Map from URL : http://tile.stamen.com/watercolor/5/21/14.jpg
Map from URL : http://tile.stamen.com/watercolor/5/22/14.jpg
Map from URL : http://tile.stamen.com/watercolor/5/23/14.jpg
Map from URL : http://tile.stamen.com/watercolor/5/24/14.jpg
Map from URL : http://tile.stamen.com/watercolor/5/21/15.jpg
Map from URL : http://tile.stamen.com/watercolor/5/22/15.jpg
Map from URL : http://tile.stamen.com/watercolor/5/23/15.jpg
Map from URL : http://tile.stamen.com/watercolor/5/24/15.jpg
# The watercolor base map on its own ...
ggmap(map1)

# ... and with the cities overlaid: one point per city, sized by yearly
# quantity (in thousands) and coloured by city.
ggmap(map1) +
  geom_point(
    data = dfGeo,
    mapping = aes(x = lon, y = lat, size = quantity_year / 1000, color = city)
  )